#!/bin/bash
#BSUB -n 1                     # 1 core
#BSUB -W 0:20                   # 20-minute run-time limit
#BSUB -R "rusage[mem=16000]"     # ~16 GB per core (assuming mem is given in MB)
#BSUB -R "rusage[ngpus_excl_p=1]"
#BSUB -R "select[gpu_model0==NVIDIATITANRTX]"
#BSUB -o /cluster/home/biyuan/exe_log/gpt3small_1gpu_3node.out.%J
#BSUB -e /cluster/home/biyuan/exe_log/gpt3small_1gpu_3node.err.%J

# --- Environment setup -------------------------------------------------------
module load gcc/6.3.0 cuda/11.0.3             # Load modules from Euler setup
source activate base                          # Activate my conda python environment

# Run from the project checkout. Abort if the directory is unreachable so the
# runner is never launched from an unexpected working directory.
cd /nfs/iiscratch-zhang.inf.ethz.ch/export/zhang/export/fm/GPT-home-private || {
  printf 'error: cannot cd to the project directory\n' >&2
  exit 1
}

# Print the GPU status so it shows up in the job output.
nvidia-smi

# --- Runner arguments --------------------------------------------------------
# Arguments are kept in arrays so each one reaches python as exactly one word,
# without relying on unquoted word-splitting of a flat string.
world_size=3

# Distributed layout: the pipeline group size equals the world size and the
# data group size is 1.
DIST_CONF=(
  --pp-mode gpipe
  --world-size "$world_size"
  --pipeline-group-size "$world_size"
  --data-group-size 1
  --cuda-id 0
)

# Model and batching options forwarded to the runner.
MODEL_CONF=(
  --fp16
  --embedding-dim 2048
  --num-heads 16
  --num-layers 6
  --batch-size 64
  --micro-batch-size 1
)

# Address of the coordinator server.
COOR_CONF=(--coordinator-server-ip 129.132.93.84)

# Select the network interface named "access" for NCCL and Gloo sockets.
export NCCL_SOCKET_IFNAME=access
export GLOO_SOCKET_IFNAME=access

# Alternative runner, kept for reference:
##python dist_training_runner_w_coordinator.py "${DIST_CONF[@]}" "${MODEL_CONF[@]}" "${COOR_CONF[@]}"

# NOTE(review): --lsf-job-no is passed without a value in this file. Confirm
# whether the runner expects one (e.g. "$LSB_JOBID") or whether the submitter
# appends it to the line below; the flag is deliberately kept as the last token.
python dist_training_runner_w_euler_coordinator.py "${DIST_CONF[@]}" "${MODEL_CONF[@]}" "${COOR_CONF[@]}" --lsf-job-no